This script reproduces all plots for Figure 1 and associated supplementary figures in PaperName. Preprocessing of raw RNA sequencing data is done in ./Folder/script, that yields allele-specific expression counts which are combined into a SingleCellExperiment (SCE) object in ./Folder/script.R. [Insert in Figure 4: CnR + methylation data preprocessing]. We first load the necessary libraries and some helper functions:

setwd("~/Desktop/Projects/XChromosome_Antonia/")

library(tidyverse)
library(ggplot2)
library(scran)
library(scater)
library(DESeq2)
library(patchwork)

source("./Scripts/auxiliary.R")

# General theme for all plots: 
theme_paper <- function(){
  theme(panel.background = element_blank(),
        panel.grid = element_line(colour= "grey92"), 
        panel.grid.minor = element_line(size = rel(0.5)),
        axis.line = element_line(colour = "black"),
        panel.border = element_rect(colour = "black", fill=NA, size=1), 
        text = element_text(size = 20))
}

escape_colors <- setNames(c("grey", "darkgreen", "orange"), nm = c("silenced / variable", "facultative", "constitutive"))

Now, read in the SCE, remove non-X genes and exclude lowly expressed genes

data <- readRDS("./ProcessedData/merged_dataset_paper.rds")
data_before_filtering <-  data[seqnames(data) == "X", ]

data <- data[(rowSums(counts_inactive(data)) + rowSums(counts_active(data))) / ncol(data) > 10, ]

data_with_autosomes <- data
data <- data[seqnames(data) == "X", ]

For the supplement, generate plots showing a) how many genes we detect with sufficient allele-specific coverage and b) allelic densities of autosomal and X-linked genes. Finally, we look at escape measured by d-scores (d = inact / (inact + act) = B6 / (B6 / CAST)) in two cell lines.

data_baseline_x <- data[,data$Condition == "Aux_0_Dox_0_WO_0_WOAuxNO"]

# Show for one C30 example: 
sample_show = 1
colnames(data_before_filtering)[[sample_show]]
## [1] "CL30.NoDoxNoAux"
data.frame(
  counts_total = counts(data_before_filtering[,sample_show])[[1]], 
  counts_allelic = (counts_active(data_before_filtering[,sample_show]) +  counts_inactive(data_before_filtering[,sample_show]))[[1]]
) %>%
  ggplot(aes(x = counts_total + 1, y = counts_allelic + 1)) + geom_point() + scale_x_log10() + scale_y_log10() + 
  theme_paper() + geom_vline(xintercept = 10, linetype = "dashed") + xlab("Total reads mapped") + ylab("Allele-specific reads mapped") + 
  geom_abline(linetype = "dashed")

ggsave("./FiguresIllustrator/FigS1/mapped_reads_c30.pdf")

data.frame(
  chromosome = as.character(seqnames(rowRanges(data_with_autosomes))) == "X", 
  total = (counts_active(data_with_autosomes[,sample_show]) + counts_inactive(data_with_autosomes[,sample_show]))[[1]], 
  ase =  (counts_active(data_with_autosomes[,sample_show]) / (counts_active(data_with_autosomes[,sample_show]) + counts_inactive(data_with_autosomes[,sample_show])))[[1]]
) %>%
  ggplot(aes(x = ase, fill = chromosome)) + geom_density() + theme_paper() + ylab("Density") + xlab("B6 / (B6 + CAST)")

ggsave("./FiguresIllustrator/FigS1/density_ase_c30.pdf")

# Show for one E6 example: 
sample_show = 31
colnames(data_before_filtering)[[sample_show]]
## [1] "C.E6.NoDox"
data.frame(
  counts_total = counts(data_before_filtering[,sample_show])[[1]], 
  counts_allelic = (counts_active(data_before_filtering[,sample_show]) +  counts_inactive(data_before_filtering[,sample_show]))[[1]]
) %>%
  ggplot(aes(x = counts_total + 1, y = counts_allelic + 1)) + geom_point() + scale_x_log10() + scale_y_log10() + 
  theme_paper() + geom_vline(xintercept = 10, linetype = "dashed") + xlab("Total reads mapped") + ylab("Allele-specific reads mapped") + 
  geom_abline(linetype = "dashed")

ggsave("./FiguresIllustrator/FigS1/mapped_reads_e6.pdf")

data.frame(
  chromosome = as.character(seqnames(rowRanges(data_with_autosomes))) == "X", 
  total = (counts_active(data_with_autosomes[,sample_show]) + counts_inactive(data_with_autosomes[,sample_show]))[[1]], 
  ase =  (counts_active(data_with_autosomes[,sample_show]) / (counts_active(data_with_autosomes[,sample_show]) + counts_inactive(data_with_autosomes[,sample_show])))[[1]]
) %>%
  ggplot(aes(x = ase, fill = chromosome)) + geom_density() + theme_paper() + ylab("Density") + xlab("B6 / (B6 + CAST)")

ggsave("./FiguresIllustrator/FigS1/density_ase_e6.pdf")

# Plot escape in individual clones
sample_nr = 1 # Clone 30

data.frame(
  total = rowSums(counts_inactive(data_baseline_x[,sample_nr])) + rowSums(counts_active(data_baseline_x[,sample_nr])),
  ase = rowSums(counts_inactive(data_baseline_x[,sample_nr])) / (rowSums(counts_inactive(data_baseline_x[,sample_nr])) +
                                                                   rowSums(counts_active(data_baseline_x[,sample_nr])))
) %>%
  rownames_to_column("Gene") %>%
  ggplot(aes(total, ase)) + geom_point() + ggrepel::geom_text_repel(aes(label = Gene)) +
  theme_paper() + scale_x_log10() + geom_hline(yintercept = 0.1, linetype = "dashed") + ggtitle("Clone 30") +
  xlab("Total expression (allelic reads)") + ylab("ASE (d-score)")

ggsave("./FiguresIllustrator/FigS1/cl30_escape.pdf", width = 12, height = 8)

sample_nr = 3 # E6

data.frame(
  total = rowSums(counts_inactive(data_baseline_x[,sample_nr])) + rowSums(counts_active(data_baseline_x[,sample_nr])),
  ase = rowSums(counts_inactive(data_baseline_x[,sample_nr])) / (rowSums(counts_inactive(data_baseline_x[,sample_nr])) +
                                                                   rowSums(counts_active(data_baseline_x[,sample_nr])))
) %>%
  rownames_to_column("Gene") %>%
  ggplot(aes(total, ase)) + geom_point() + ggrepel::geom_text_repel(aes(label = Gene)) +
  theme_paper() + scale_x_log10() + geom_hline(yintercept = 0.1, linetype = "dashed") + ggtitle("Clone 30") +
  xlab("Total expression (allelic reads)") + ylab("ASE (d-score)")

ggsave("./FiguresIllustrator/FigS1/e6_escape.pdf", width = 12, height = 8)

We now quantify the number of genes which escape within and across clones and cell lines.

data_here <- data[,data$ConditionClean == "Control"]

# get d-scores across genes + samples
ratios <- counts_inactive(data_baseline_x) / (counts_inactive(data_baseline_x) + counts_active(data_baseline_x))

# Genes with average ASE > 0.7 in the control sample are likely mapping artefacts: 
genes_out <- names(rowMeans(ratios, na.rm = T)[rowMeans(ratios, na.rm = T) > 0.8 & rownames(ratios) != "Xist"])
genes_out_na <- rownames( ratios[!apply(ratios, 1, function(x){!any(is.na(x))}), ] )

genes_out <- c(genes_out, genes_out_na)

ratios <- ratios[!rownames(ratios) %in% genes_out, ]

colnames(ratios) <- c("CL30", "CL31", "E6 (rep1)", "E6 (rep2)", "E6 (rep3)", "CL31.16", "CL30.7")

genes_show <- c("Xist", "Mecp2", "Kdm6a", "Kdm5c")
genes_of_choice <- setNames(rep("", nrow(ratios)), rownames(ratios))
genes_of_choice[genes_show] <- genes_show

# Plot d-scores for genes after ordering the genes by chromosomale coordinate
ratios %>%
  t() %>% data.frame(check.names = F) %>%
  add_column("Condition" = data_here$ConditionClean) %>%
  add_column("Clone" = paste0(data_here$Clone, " - ", data_here$Experiment)) %>%
  pivot_longer(-c(Condition, Clone)) %>%
  dplyr::filter(Condition == "Control") %>%
  add_column(position = start(rowRanges(data_here[as.character(.$name), ]))) %>%
  ggplot(aes(Clone, y = reorder(name, position), fill = value)) + geom_tile(width = 0.9) +
  theme_paper() + theme(axis.text.x=element_text(angle = 90, hjust = 1)) + 
  theme(axis.ticks.y = element_blank()) + xlab("") + ylab("Chromosome position") + 
  scale_fill_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") + 
  scale_y_discrete(labels = genes_of_choice, position = "right") + ylab("") + labs(fill = "d-score")

ggsave("./FiguresIllustrator/FigS1/heatmap_ordered.pdf", width = 10, height = 10)

ratios %>%
  t() %>% data.frame(check.names = F) %>%
  add_column("Condition" = data_here$ConditionClean) %>%
  add_column("Clone" = paste0(data_here$Clone, " - ", data_here$Experiment)) %>%
  pivot_longer(-c(Condition, Clone)) %>%
  dplyr::filter(Condition == "Control") %>%
  add_column(position = start(rowRanges(data_here[as.character(.$name), ]))) %>%
  ggplot(aes(Clone, y = reorder(name, position), fill = value)) + geom_tile(width = 0.9) +
  theme_paper() + theme(axis.text.x=element_text(angle = 90, hjust = 1)) + 
  theme(axis.ticks.y = element_blank()) + xlab("") + ylab("Chromosome position") + 
  scale_fill_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") + 
  scale_y_discrete(labels = genes_of_choice, position = "left") + ylab("") + labs(fill = "d-score") + coord_flip()

ggsave("./FiguresIllustrator/FigS1/heatmap_ordered_streched.pdf", width = 30, height = 10)

# Plot d-scores for genes that escape in any sample, excluding genes with > 0.7 d-score
ratios <- ratios[!rownames(ratios) %in% genes_out, ]
ratios <- ratios[order(rowMeans(ratios), decreasing = T), ]

# Now we define the baseline escapees per sample as the genes with d-score > 0.1, this will be used throughout the analysis
escapees <- apply(ratios, 2, function(x){x > 0.1})
escapees_per_clone <- apply(escapees, 2, function(x){rownames(escapees)[x]})
escapees_per_clone <- lapply(escapees_per_clone, function(x){x[x != "Xist"]})
names(escapees_per_clone) <- c("CL30 - December2021", "CL31 - December2021", "E6 - October2021", "E6 - March2022", "E6 - September2022", 
                               "CL30 - October2022", "CL31 - October2022")

# How many do we get per clone and do they overlap?
ratios %>%
  t() %>% data.frame(check.names = F) %>%
  add_column("Condition" = data_here$ConditionClean) %>%
  add_column("Clone" = paste0(data_here$Clone, " - ", data_here$Experiment)) %>%
  pivot_longer(-c(Condition, Clone)) %>%
  dplyr::filter(Condition == "Control") %>%
  dplyr::filter(value > 0.1) %>%
  group_by(name) %>%
  mutate(escape_in_clones = n() >= 7) %>% # CAVE - here we need to adjust
  ggplot(aes(x = Clone, fill = escape_in_clones)) + geom_bar() + theme_paper() + 
  theme(axis.text.x=element_text(angle = 45, hjust = 1)) + xlab("") + ylab("Number of genes with d-score > 0.1") + 
  scale_fill_manual(values = c("grey", "darkred"), name = "Escape in all samples")

ggsave("./FiguresIllustrator/FigS1/number_of_escapees_per_clone.pdf", width = 8, height = 8)

ratios %>%
  t() %>% data.frame(check.names = F) %>%
  add_column("Condition" = data_here$ConditionClean) %>%
  add_column("Clone" = paste0(data_here$Clone, " - ", data_here$Experiment)) %>%
  pivot_longer(-c(Condition, Clone)) %>%
  dplyr::filter(Condition == "Control") %>%
  dplyr::filter(value > 0.1) %>%
  group_by(name) %>%
  mutate(escape_in_clones = n() >= 7) %>%
  add_column(escape_group = rowData(data[.$name, ])$EscapeAnnotation) %>%
  ggplot(aes(x = Clone, fill = escape_in_clones)) + geom_bar() + theme_paper() + 
    theme(axis.text.x=element_text(angle = 45, hjust = 1)) + xlab("") + ylab("Number of genes with d-score > 0.1") + 
    scale_fill_manual(values = c("grey", "darkred"), name = "Escape in all clones") + facet_wrap(~escape_group)

ggsave("./FiguresIllustrator/FigS1/number_of_escapees_per_clone_stratified.pdf", width = 8, height = 8)

For the main figure, we now plot escape during Xist-overexpression (Fig1 a-…)

# Subset data on relevant samples and for the main figure, only show E6
data_here <- data[,data$ConditionClean %in% c("Control", "Dox (3d)", "Dox (7d)", "Dox (14d)",  "Dox (21d)")]
data_here <- data_here[,data_here$Clone == "E6"]

ratios <- counts_inactive(data_here) / (counts_inactive(data_here) + counts_active(data_here))
ratios <- ratios[!rownames(ratios) %in% genes_out, ]

# For Xist-expression, we need a scaling factor per sample to account for differences in sequencing depth
size_factors <- colSums(counts(data_with_autosomes)) / colSums(counts(data_with_autosomes))[[1]]

# check that Xist-overexpression works by plotting Xist counts per million (CPM) per sample
data.frame(
  Dox = data_here$ndDox,
  Clone = paste0(data_here$Clone, " - ", data_here$Experiment), 
  Expression = as.numeric(counts(data_here["Xist", ]) / colSums(counts(data_here)) * 1e6)
) %>%
  add_column(Condition = ifelse(.$Dox == 0, "Control", paste0("Dox (", .$Dox, " days)"))) %>%
  mutate(Condition = factor(Condition, levels = rev(c("Control", paste0("Dox (", c(3, 7, 14, 21), " days)"))))) %>%
  ggplot(aes(x = Condition, y = Expression)) + 
  stat_summary(stat = "mean", geom = "bar", fill = "grey", col = "black") + 
    coord_flip() + theme_paper() + 
    xlab("") + ylab("Expression (CPM)") + geom_jitter(size = 3, width = 0.1) + 
    scale_y_continuous(expand = c(0, 0), limits = c(0, 320000)) + 
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))

ggsave("./FiguresIllustrator/Fig1/xist_overexpression.pdf", width = 4, height = 4)

# For the next heatmap, we want to highlight a few genes
genes_show <- c("Xist", "Mecp2", "Kdm6a", "Kdm5c")
genes_of_choice <- setNames(rep("", nrow(ratios)), rownames(ratios))
genes_of_choice[genes_show] <- genes_show

# Heatmap showing d-scores of all genes across Xist-OE
ratios %>%
  t() %>% data.frame(check.names = F) %>%
  add_column("Condition" = data_here$ConditionClean) %>%
  add_column("Clone" = paste0(data_here$Clone, " - ", data_here$Experiment)) %>%
  pivot_longer(-c(Condition, Clone)) %>%
  add_column(position = start(rowRanges(data_here[as.character(.$name), ]))) %>%
  mutate(Condition = factor(Condition, levels = rev(c("Control", "Dox (3d)", "Dox (7d)", "Dox (14d)", "Dox (21d)")))) %>%
  ggplot(aes(reorder(name, position), y = Condition, fill = value, col = value)) + geom_tile(width = 0.9) +
    scale_fill_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") + 
    scale_colour_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") + 
    scale_y_discrete(labels = genes_of_choice, position = "right") + 
    theme_paper() + 
    theme(axis.ticks.x = element_blank(), panel.grid.major = element_blank()) + 
    xlab("") + 
    ylab("") + theme(panel.border = element_blank(), axis.line=element_blank()) + 
    scale_x_discrete(labels = genes_of_choice, position = "bottom") + 
    labs(fill = "d-score") + guides(fill = "none")

ggsave("./FiguresIllustrator/Fig1/heatmap_ordered.pdf", width = 6, height = 4)

# Get escapees that are consistent across E6 samples
e6_escapees <- Reduce("intersect", escapees_per_clone[3:5])

# For the supplement, also plot stratified by replicates: 
ratios %>%
  t() %>% data.frame(check.names = F) %>%
  add_column("Condition" = data_here$ConditionClean) %>%
  add_column("Clone" = paste0(data_here$Clone, " - ", data_here$Experiment)) %>%
  pivot_longer(-c(Condition, Clone)) %>%
  add_column(position = start(rowRanges(data_here[as.character(.$name), ]))) %>%
  mutate(Condition = factor(Condition, levels = rev(c("Control", "Dox (3d)", "Dox (7d)", "Dox (14d)", "Dox (21d)")))) %>%
  ggplot(aes(reorder(name, position), y = interaction(Clone, Condition, sep = " -- "), group = Clone, fill = value, col = value)) + geom_tile(width = 0.9) +
    scale_fill_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") + 
    scale_colour_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") + 
    scale_y_discrete(labels = genes_of_choice, position = "right") + 
    theme_paper() + 
    theme(axis.ticks.x = element_blank(), panel.grid.major = element_blank()) + 
    xlab("") + 
    ylab("") + theme(panel.border = element_blank(), axis.line=element_blank()) + 
    scale_x_discrete(labels = genes_of_choice, position = "bottom") + 
    labs(fill = "d-score")

ggsave("./FiguresIllustrator/FigS1/heatmap_ordered_by_clones.pdf", width = 10, height = 10)

# Now, we show a heatmap with only the escapees

# To structure the data, we order the genes using hierarchical clustering
data_for_clustering <- do.call("rbind", lapply(unique(data_here$ConditionClean), function(x){
  if (sum(data_here$ConditionClean == x) == 1){
    ratios[e6_escapees, data_here$ConditionClean == x]
  } else {
    rowMeans(ratios[e6_escapees, data_here$ConditionClean == x])
  }
}))

clustering_order <- setNames(hclust(dist(t(data_for_clustering)))$order, rownames(ratios[e6_escapees, ]))

ratios[e6_escapees, ][clustering_order, ] %>%
  t() %>% data.frame(check.names = F) %>%
  add_column("Condition" = data_here$ConditionClean) %>%
  add_column("Clone" = paste0(data_here$Clone, " - ", data_here$Experiment)) %>%
  pivot_longer(-c(Condition, Clone)) %>%
  mutate(name = factor(name, levels = unique(.$name))) %>%
  add_column(position = start(rowRanges(data_here[as.character(.$name), ]))) %>%
  mutate(Condition = factor(Condition, levels = rev(c("Control", "Dox (3d)", "Dox (7d)", "Dox (14d)", "Dox (21d)")))) -> to_plot

p1 <- to_plot %>%
  ggplot(aes(name, y = Condition, fill = value, col = value)) + geom_tile(width = 0.9) +
    scale_fill_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") + 
    scale_colour_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") + 
    theme_paper() + 
    theme(axis.ticks.x = element_blank(), panel.grid.major = element_blank()) + 
    xlab("") + 
    ylab("") + theme(panel.border = element_blank(), axis.line=element_blank()) + 
    labs(fill = "d-score") + 
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=0, size = 8)) + 
    scale_x_discrete(position = "top") + guides(fill = "none")

# Plot escapee annotation at the bottom of this: 
p2 <- data.frame(Gene = factor(e6_escapees, levels = levels(to_plot$name)),  Annotation = rowData(data)[e6_escapees, ]$EscapeAnnotation) %>%
  ggplot(aes(x = Gene, y = 1, fill = Annotation)) + theme_void() + geom_tile(col = "black") + scale_fill_manual(values = escape_colors) + 
  theme(legend.position = "None")

p1 / p2 + plot_layout(heights = c(4, 0.2))

ggsave("./FiguresIllustrator/Fig1/heatmap_ordered_only_escapees.pdf", width = 20, height = 4)

# Also do this reordering
p1 <- to_plot %>%
  ggplot(aes(reorder(name, position), y = Condition, fill = value, col = value)) + geom_tile(width = 0.9) +
    scale_fill_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") + 
    scale_colour_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") + 
    theme_paper() + 
    theme(axis.ticks.x = element_blank(), panel.grid.major = element_blank()) + 
    xlab("") + 
    ylab("") + theme(panel.border = element_blank(), axis.line=element_blank()) + 
    labs(fill = "d-score") + 
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=0, size = 8)) + 
    scale_x_discrete(position = "top") + guides(fill = "none")

# Plot escapee annotation at the bottom of this: 
p2 <- data.frame(Gene = factor(e6_escapees, levels = levels(to_plot$name)),  Annotation = rowData(data)[e6_escapees, ]$EscapeAnnotation) %>%
  ggplot(aes(x = Gene, y = 1, fill = Annotation)) + theme_void() + geom_tile(col = "black") + scale_fill_manual(values = escape_colors) + 
  theme(legend.position = "None")

p1 / p2 + plot_layout(heights = c(4, 0.2))

ggsave("./FiguresIllustrator/Fig1/heatmap_ordered_only_escapees_by_position.pdf", width = 20, height = 4)

# Quantify for genes that escape in E6 in the Control sample
ratios[e6_escapees, ] %>%
  t() %>% data.frame() %>%
  add_column("Dox" = data_here$ndDox) %>%
  add_column("Clone" = paste0(data_here$Clone, " - ", data_here$Experiment)) %>%
  pivot_longer(-c(Dox, Clone)) %>%
  group_by(Clone) %>%
  add_column(Condition = ifelse(.$Dox == 0, "Control", paste0("Dox (", .$Dox, " days)"))) %>%
  mutate(Condition = factor(Condition, levels = (c("Control", paste0("Dox (", c(3, 7, 14, 21), " days)"))))) %>%
  add_column(escape_status = unlist(rowData(data_here)[match(.$name, rownames(data_here)), ]$EscapeAnnotation)) %>%
  mutate(escape_status = factor(escape_status, c("silenced / variable", "facultative", "constitutive"))) %>%
  dplyr::filter(!is.na(escape_status)) %>%
  ggplot(aes(x = Condition, y = value, fill = escape_status)) + 
    geom_boxplot(col = "black", outlier.color = "grey") + 
    theme_paper() + ylab("ASE (d-score)") + 
    theme(axis.text.x=element_text(angle = 45, hjust = 1)) + 
    scale_fill_manual(values = c("grey", "darkgreen", "orange"), labels = c("Silenced / Variable", "Facultative", "Constitutive")) + 
    labs(fill = "Escape Category") + xlab("")

ggsave("./FiguresIllustrator/Fig1/escapee_silencing_boxplots.pdf", width = 9, height = 6)

ratios %>%
  t() %>% data.frame() %>%
  add_column("Dox" = data_here$ndDox) %>%
  add_column("Clone" = paste0(data_here$Clone, " - ", data_here$Experiment)) %>%
  pivot_longer(-c(Dox, Clone)) %>%
  group_by(Clone) %>%
  dplyr::filter(name %in% escapees_per_clone[[unique(Clone)]]) %>%
  add_column(Condition = ifelse(.$Dox == 0, "Control", paste0("Dox (", .$Dox, " days)"))) %>%
  mutate(Condition = factor(Condition, levels = (c("Control", paste0("Dox (", c(3, 7, 14, 21), " days)"))))) %>%
  add_column(escape_status = unlist(rowData(data_here)[match(.$name, rownames(data_here)), ]$EscapeAnnotation)) %>%
  mutate(escape_status = factor(escape_status, c("silenced / variable", "facultative", "constitutive"))) %>%
  dplyr::filter(!is.na(escape_status)) %>%
  ggplot(aes(x = Condition, y = value, fill = Clone)) + 
    geom_boxplot(col = "black", outlier.color = "grey", position = position_dodge2(preserve = "single")) + 
    theme_paper() + ylab("ASE (d-score)") + 
    theme(axis.text.x=element_text(angle = 45, hjust = 1)) + 
    labs(fill = "Escape Category") + xlab("")

ggsave("./FiguresIllustrator/FigS1/escapee_silencing_boxplots_per_clone.pdf", width = 9, height = 6)





### For supplement, plot the heatmap etc in CL30/CL31
data_here <- data[,data$Condition %in% c("Aux_0_Dox_0_WO_0_WOAuxNO", "Aux_0_Dox_3_WO_0_WOAuxNO", "Aux_0_Dox_7_WO_0_WOAuxNO", 
                                         "Aux_0_Dox_14_WO_0_WOAuxNO", "Aux_0_Dox_21_WO_0_WOAuxNO")]
data_here <- data_here[,data_here$Clone != "E6"]

ratios <- counts_inactive(data_here) / (counts_inactive(data_here) + counts_active(data_here))
ratios <- ratios[!rownames(ratios) %in% genes_out, ]

# For Xist-expression, we need a scaling factor per sample to account for differences in seq depth
size_factors <- colSums(counts(data_with_autosomes)) / colSums(counts(data_with_autosomes))[[1]]

# check that Xist-overexpression works
data.frame(
  Dox = data_here$ndDox,
  Clone = paste0(data_here$Clone, " - ", data_here$Experiment), 
  Expression = as.numeric(counts(data_here["Xist", ]) / colSums(counts(data_here)) * 1e6)
) %>%
  add_column(Condition = ifelse(.$Dox == 0, "Control", paste0("Dox (", .$Dox, " days)"))) %>%
  mutate(Condition = factor(Condition, levels = rev(c("Control", paste0("Dox (", c(3, 7, 14, 21), " days)"))))) %>%
  ggplot(aes(x = Condition, y = Expression)) + 
  stat_summary(stat = "mean", geom = "bar", fill = "grey", col = "black") + 
  coord_flip() + theme_paper() + 
  xlab("") + ylab("Xist Expression (normalized CPM)") + geom_jitter(size = 3, width = 0.1) + 
  scale_y_continuous(expand = c(0, 0), limits = c(0, 260000))

ggsave("./FiguresIllustrator/FigS1/xist_overexpression_cl30_31.pdf", width = 6, height = 8)

genes_show <- c("Xist", "Mecp2", "Kdm6a", "Kdm5c")
genes_of_choice <- setNames(rep("", nrow(ratios)), rownames(ratios))
genes_of_choice[genes_show] <- genes_show

ratios %>%
  t() %>% data.frame(check.names = F) %>%
  add_column("Condition" = data_here$ConditionClean) %>%
  add_column("Clone" = paste0(data_here$Clone, " - ", data_here$Experiment)) %>%
  pivot_longer(-c(Condition, Clone)) %>%
  add_column(position = start(rowRanges(data_here[as.character(.$name), ]))) %>%
  mutate(Condition = factor(Condition, levels = rev(c("Control", "Dox (3d)", "Dox (7d)", "Dox (14d)", "Dox (21d)")))) %>%
  ggplot(aes(reorder(name, position), y = Condition, fill = value, col = value)) + geom_tile(width = 0.9) +
    scale_fill_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") + 
    scale_colour_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") + 
    scale_y_discrete(labels = genes_of_choice, position = "right") + 
    theme_paper() + 
    theme(axis.ticks.x = element_blank(), panel.grid.major = element_blank()) + 
    xlab("") + 
    ylab("") + theme(panel.border = element_blank(), axis.line=element_blank()) + 
    scale_x_discrete(labels = genes_of_choice, position = "bottom") + 
    labs(fill = "d-score")

ggsave("./FiguresIllustrator/FigS1/heatmap_ordered_cl30_cl31.pdf", width = 10, height = 10)

ratios %>%
  t() %>% data.frame(check.names = F) %>%
  add_column("Condition" = data_here$ConditionClean) %>%
  add_column("Clone" = paste0(data_here$Clone, " - ", data_here$Experiment)) %>%
  pivot_longer(-c(Condition, Clone)) %>%
  add_column(position = start(rowRanges(data_here[as.character(.$name), ]))) %>%
  #dplyr::filter(value < 0.7) %>%
  #dplyr::filter(Clone == "E6") %>%
  mutate(Condition = factor(Condition, levels = rev(c("Control", "Dox (3d)", "Dox (7d)", "Dox (14d)", "Dox (21d)")))) %>%
  ggplot(aes(reorder(name, position), y = interaction(Clone, Condition, sep = " -- "), group = Clone, fill = value, col = value)) + geom_tile(width = 0.9) +
    scale_fill_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") + 
    scale_colour_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") + 
    scale_y_discrete(labels = genes_of_choice, position = "right") + 
    theme_paper() + 
    theme(axis.ticks.x = element_blank(), panel.grid.major = element_blank()) + 
    xlab("") + 
    ylab("") + theme(panel.border = element_blank(), axis.line=element_blank()) + 
    scale_x_discrete(labels = genes_of_choice, position = "bottom") +
    labs(fill = "d-score")

ggsave("./FiguresIllustrator/FigS1/heatmap_ordered_by_clones_cl30_cl31.pdf", width = 10, height = 10)

escapees_cl30 <- Reduce("intersect", escapees_per_clone[-c(3:5)])
ratios[escapees_cl30, ] %>%
  t() %>% data.frame() %>%
  add_column("Dox" = data_here$ndDox) %>%
  add_column("Clone" = paste0(data_here$Clone, " - ", data_here$Experiment)) %>%
  pivot_longer(-c(Dox, Clone)) %>%
  group_by(Clone) %>%
  add_column(Condition = ifelse(.$Dox == 0, "Control", paste0("Dox (", .$Dox, " days)"))) %>%
  mutate(Condition = factor(Condition, levels = (c("Control", paste0("Dox (", c(3, 7, 14, 21), " days)"))))) %>%
  add_column(escape_status = unlist(rowData(data_here)[match(.$name, rownames(data_here)), ]$EscapeAnnotation)) %>%
  mutate(escape_status = factor(escape_status, c("silenced / variable", "facultative", "constitutive"))) %>%
  dplyr::filter(!is.na(escape_status)) %>%
  ggplot(aes(x = Condition, y = value, fill = escape_status)) + 
    geom_boxplot(col = "black", outlier.color = "grey") + 
    theme_paper() + ylab("ASE (d-score)") + 
    theme(axis.text.x=element_text(angle = 45, hjust = 1)) + 
    scale_fill_manual(values = c("grey", "darkgreen", "orange"), labels = c("Silenced / Variable", "Facultative", "Constitutive")) + 
    labs(fill = "Escape Category") + xlab("")

ggsave("./FiguresIllustrator/Fig1/escapee_silencing_boxplots_cl30_cl31.pdf", width = 9, height = 6)

ratios %>%
  t() %>% data.frame() %>%
  add_column("Dox" = data_here$ndDox) %>%
  add_column("Clone" = paste0(data_here$Clone, " - ", data_here$Experiment)) %>%
  pivot_longer(-c(Dox, Clone)) %>%
  group_by(Clone) %>%
  dplyr::filter(name %in% escapees_per_clone[[unique(Clone)]]) %>%
  add_column(Condition = ifelse(.$Dox == 0, "Control", paste0("Dox (", .$Dox, " days)"))) %>%
  mutate(Condition = factor(Condition, levels = (c("Control", paste0("Dox (", c(3, 7, 14, 21), " days)"))))) %>%
  add_column(escape_status = unlist(rowData(data_here)[match(.$name, rownames(data_here)), ]$EscapeAnnotation)) %>%
  mutate(escape_status = factor(escape_status, c("silenced / variable", "facultative", "constitutive"))) %>%
  dplyr::filter(!is.na(escape_status)) %>%
  ggplot(aes(x = Condition, y = value, fill = Clone)) + 
    geom_boxplot(col = "black", outlier.color = "grey", position = position_dodge2(preserve = "single")) + 
    theme_paper() + ylab("ASE (d-score)") + 
    theme(axis.text.x=element_text(angle = 45, hjust = 1)) + 
    labs(fill = "Escape Category") + xlab("")

ggsave("./FiguresIllustrator/FigS1/escapee_silencing_boxplots_per_clone_cl30_cl31.pdf", width = 9, height = 6)


############ 
# To structure the data, we order the genes using hierarchical clustering
data_for_clustering <- do.call("rbind", lapply(unique(data_here$ConditionClean), function(x){
  if (sum(data_here$ConditionClean == x) == 1){
    ratios[escapees_cl30, data_here$ConditionClean == x]
  } else {
    rowMeans(ratios[escapees_cl30, data_here$ConditionClean == x])
  }
}))

clustering_order <- setNames(hclust(dist(t(data_for_clustering)))$order, rownames(ratios[escapees_cl30, ]))

ratios[escapees_cl30, ][clustering_order, ] %>%
  t() %>% data.frame(check.names = F) %>%
  add_column("Condition" = data_here$ConditionClean) %>%
  add_column("Clone" = paste0(data_here$Clone, " - ", data_here$Experiment)) %>%
  pivot_longer(-c(Condition, Clone)) %>%
  mutate(name = factor(name, levels = unique(.$name))) %>%
  add_column(position = start(rowRanges(data_here[as.character(.$name), ]))) %>%
  mutate(Condition = factor(Condition, levels = rev(c("Control", "Dox (3d)", "Dox (7d)", "Dox (14d)", "Dox (21d)")))) -> to_plot

p1 <- to_plot %>%
  ggplot(aes(name, y = Condition, fill = value, col = value)) + geom_tile(width = 0.9) +
    scale_fill_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") + 
    scale_colour_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") + 
    theme_paper() + 
    theme(axis.ticks.x = element_blank(), panel.grid.major = element_blank()) + 
    xlab("") + 
    ylab("") + theme(panel.border = element_blank(), axis.line=element_blank()) + 
    labs(fill = "d-score") + 
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=0, size = 8)) + 
    scale_x_discrete(position = "top") + guides(fill = "none")

# Plot escapee annotation at the bottom of this: 
p2 <- data.frame(Gene = factor(e6_escapees, levels = levels(to_plot$name)),  Annotation = rowData(data)[e6_escapees, ]$EscapeAnnotation) %>%
  ggplot(aes(x = Gene, y = 1, fill = Annotation)) + theme_void() + geom_tile(col = "black") + scale_fill_manual(values = escape_colors) + 
  theme(legend.position = "None")

p1 / p2 + plot_layout(heights = c(4, 0.2))

ggsave("./FiguresIllustrator/FigS1/cl30_heatmap_ordered_only_escapees.pdf", width = 20, height = 4)

# Also do this reordering
p1 <- to_plot %>%
  ggplot(aes(reorder(name, position), y = Condition, fill = value, col = value)) + geom_tile(width = 0.9) +
    scale_fill_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") + 
    scale_colour_gradient2(midpoint = 0.5, mid = "#117733", low = "#BEBEBE", high = "black", limits = c(0, 1), na.value = "white") + 
    theme_paper() + 
    theme(axis.ticks.x = element_blank(), panel.grid.major = element_blank()) + 
    xlab("") + 
    ylab("") + theme(panel.border = element_blank(), axis.line=element_blank()) + 
    labs(fill = "d-score") + 
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=0, size = 8)) + 
    scale_x_discrete(position = "top") + guides(fill = "none")

# Plot escapee annotation at the bottom of this: 
p2 <- data.frame(Gene = factor(e6_escapees, levels = levels(to_plot$name)),  Annotation = rowData(data)[e6_escapees, ]$EscapeAnnotation) %>%
  ggplot(aes(x = Gene, y = 1, fill = Annotation)) + theme_void() + geom_tile(col = "black") + scale_fill_manual(values = escape_colors) + 
  theme(legend.position = "None")

p1 / p2 + plot_layout(heights = c(4, 0.2))

ggsave("./FiguresIllustrator/Fig1/cl30_heatmap_ordered_only_escapees_by_position.pdf", width = 20, height = 4)

For the supplement, we look at some genes as examples:

# Subset data on relevant samples and for the main figure, only show E6
data_here <- data[,data$ConditionClean %in% c("Control", "Dox (3d)", "Dox (7d)", "Dox (14d)",  "Dox (21d)")]
data_here <- data_here[,data_here$Clone == "E6"]

ratios <- counts_inactive(data_here) / (counts_inactive(data_here) + counts_active(data_here))
ratios <- ratios[!rownames(ratios) %in% genes_out, ]

data_test_inactive <- counts_inactive(data_here)
data_test_active <- counts_active(data_here)

# plot_gene <- function(gene, save = F, path = NULL, suffix = "_escape_plot.pdf"){
#   
#   i = gene
#   y = as.numeric(data_test_inactive[i, ])
#   N = as.numeric(data_test_active[i, ] + data_test_inactive[i, ])
#   
#   pp  <- data.frame(
#     y = y,
#     N = N,
#     Condition = data_here$ConditionClean,
#     Dox = data_here$ndDox,
#     Washout = data_here$washout, 
#     Clone = paste0(data_here$Clone, " - ", data_here$Experiment)
#   ) %>%
#     mutate(Condition = factor(Condition, levels = c("Control", "Dox (3d)", "Dox (7d)", "Dox (14d)", "Dox (21d)"))) %>%
#     ggplot(aes(x = Condition, y = y / N, col = Clone)) + geom_point(size = 4) + 
#       ylim(0, 1) +
#       geom_line(aes(group = Clone), linetype = 'dashed') + theme_paper() + 
#       xlab("") + ylab("ASE (d-score)") + 
#       #scale_x_discrete(labels = c("Dox + washout", "Dox only", "Control")) + 
#       NULL
#   pp
#   
#   if (save){
#     ggsave(paste0(path, "./", gene, suffix), pp)
#   } 
#   
#   return(pp)
# }
# 
# plot_gene("Kdm5c", save = T, "~/Desktop/Projects/XChromosome_Antonia/Figures_new/Fig1/", "_escape_all_clones.pdf")
# plot_gene("Kdm6a", save = T, "~/Desktop/Projects/XChromosome_Antonia/Figures_new/Fig1/", "_escape_all_clones.pdf")

To make sure we observe substantial reduction in gene expression, we use DESeq2 to test for differences in escapee expression between Control and 7d. We do this either for combined, only inactive and only active counts:

data_here <- data[,data$ConditionClean %in% c("Control", "Dox (7d)")]
data_here <- data_here[,data_here$Clone == "E6"]
data_here <- data_here[unique(do.call("c", escapees_per_clone[grepl("E6", names(escapees_per_clone))])), ]
data_here <- data_here[!rownames(data_here) %in% genes_out, ]

data_for_sf <- data_with_autosomes[,colnames(data_here)]

size_factors_global <- DESeq2::estimateSizeFactors(DESeqDataSetFromMatrix(counts(data_for_sf), colData = DataFrame(Cond = rep(1, 6)), design = ~0))
size_factors_x <- DESeq2::estimateSizeFactors(DESeqDataSetFromMatrix(counts_active(data_here) + counts_inactive(data_here), colData = DataFrame(Cond = rep(1, 6)), design = ~0))

deseq_inactive <- DESeqDataSetFromMatrix(countData = counts_inactive(data_here), colData = colData(data_here), design = ~ Experiment + ConditionClean)
deseq_inactive$sizeFactor <- size_factors_global$sizeFactor
deseq_inactive <- DESeq(deseq_inactive)

deseq_active <- DESeqDataSetFromMatrix(countData = counts_active(data_here), colData = colData(data_here), design = ~ Experiment + ConditionClean)
deseq_active$sizeFactor <- size_factors_global$sizeFactor
deseq_active <- DESeq(deseq_active)

deseq_total <- DESeqDataSetFromMatrix(countData = counts(data_here), colData = colData(data_here), design = ~ Experiment + ConditionClean)
deseq_total$sizeFactor <- size_factors_global$sizeFactor
deseq_total <- DESeq(deseq_total)

df_inactive <- results(deseq_inactive) %>%
  data.frame() %>%
  rownames_to_column("Gene") %>%
  add_column(Status = "Inactive")

df_active <- results(deseq_active) %>%
  data.frame() %>%
  rownames_to_column("Gene") %>%
  add_column(Status = "Active")

df_total <- results(deseq_total) %>%
  data.frame() %>%
  rownames_to_column("Gene") %>%
  add_column(Status=  "Total")

rbind(df_inactive, rbind(df_active, df_total)) %>%
  ggplot(aes(x = baseMean, y = log2FoldChange, col = padj < 0.1)) + geom_point() + scale_x_log10() + 
    ggrepel::geom_text_repel(aes(label = Gene)) + xlab("Mean Expression (log normalized counts)") + ylab("log2 (7d dox / Control)") + 
    theme_paper() + facet_wrap(~Status) + scale_color_manual(values = c("black", "red"))

ggsave("./FiguresIllustrator/FigS1/escape_deseq_analysis.pdf", width = 9, height = 6)

# Visualize also as a heatmap: 

df_combined <- rbind(df_inactive, rbind(df_active, df_total))
df_combined$genomic_position <- start(rowRanges(data[df_combined$Gene, ]))

df_combined %>% 
  ggplot(aes(x = 1, y = reorder(Gene, genomic_position), fill = log2FoldChange)) + geom_tile(col = "black") + facet_wrap(~Status) + 
    theme_paper() + scale_fill_gradient2() + geom_text(aes(label = ifelse(padj < 0.1, ".", "")), size = 10, nudge_y = 0.7) + 
    theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())

ggsave("./FiguresIllustrator/FigS1/escape_deseq_analysis_heatmap.pdf", width = 9, height = 30)

Finally, we test whether the strength of silencing is associated with the genes expression level or escape level (also stratified by escapee category to avoid simpsons paradox.)

data_here <- data[,data$ConditionClean %in% c("Control", "Dox (7d)")]

ratios <- counts_inactive(data_here) / (counts_inactive(data_here) + counts_active(data_here))
ratios <- ratios[!rownames(ratios) %in% genes_out, ]
ratios <- ratios[unique(do.call("c", escapees_per_clone[grepl("E6", names(escapees_per_clone))])), ]

data_here <- data_here[unique(do.call("c", escapees_per_clone[grepl("E6", names(escapees_per_clone))])), ]

# collect expression levels and escape levels
escape_colors <- setNames(c("grey", "darkgreen", "orange"), nm = c("silenced / variable", "facultative", "constitutive"))

df_here <- data.frame(
  EscapeCategory = rowData(data_here)$EscapeAnnotation, 
  BasalExpressionLevel = rowMeans(counts(data_here[,data_here$ConditionClean == "Control"])), 
  BasalInactiveExpressionLevel = rowMeans(counts_inactive(data_here[,data_here$ConditionClean == "Control"])), 
  BasalEscape = rowMeans(ratios[,data_here$ConditionClean == "Control"]), 
  EscapeDifference = rowMeans(ratios[,data_here$ConditionClean == "Dox (7d)"]) - rowMeans(ratios[,data_here$ConditionClean == "Control"])
) 


df_here %>% ggplot(aes(x = BasalExpressionLevel, y = BasalEscape, col = EscapeCategory)) + 
  geom_point() + scale_x_log10() + theme_paper() + scale_color_manual(values = escape_colors) + geom_hline(yintercept = 0.5, linetype = 'dashed', col = "grey")

df_here %>% ggplot(aes(x = BasalExpressionLevel, y = BasalEscape, col = EscapeCategory)) + 
  geom_point() + scale_x_log10() + theme_paper() + scale_color_manual(values = escape_colors) + geom_smooth(method = 'lm', linetype = 'dashed')  + 
  geom_hline(yintercept = 0.5, linetype = 'dashed', col = "grey")

ggsave("./FiguresIllustrator/FigS1/basal_exp_vs_basal_escape.pdf", width = 9, height = 6)


df_here %>% ggplot(aes(x = BasalExpressionLevel, y = EscapeDifference, col = EscapeCategory)) + 
  geom_point() + scale_x_log10() + theme_paper() + scale_color_manual(values = escape_colors)

df_here %>% ggplot(aes(x = BasalExpressionLevel, y = EscapeDifference, col = EscapeCategory)) + 
  geom_point() + scale_x_log10() + theme_paper() + scale_color_manual(values = escape_colors) + geom_smooth(method = 'lm', linetype = 'dashed')

ggsave("./FiguresIllustrator/FigS1/basal_exp_vs_basal_escape_diff.pdf", width = 9, height = 6)


df_here %>% ggplot(aes(x = BasalInactiveExpressionLevel, y = BasalEscape, col = EscapeCategory)) + 
  geom_point() + scale_x_log10() + theme_paper() + scale_color_manual(values = escape_colors)

df_here %>% ggplot(aes(x = BasalInactiveExpressionLevel, y = BasalEscape, col = EscapeCategory)) + 
  geom_point() + scale_x_log10() + theme_paper() + scale_color_manual(values = escape_colors) + geom_smooth(method = 'lm', linetype = 'dashed')

df_here %>% ggplot(aes(x = BasalInactiveExpressionLevel, y = EscapeDifference, col = EscapeCategory)) + 
  geom_point() + scale_x_log10() + theme_paper() + scale_color_manual(values = escape_colors)

df_here %>% ggplot(aes(x = BasalInactiveExpressionLevel, y = EscapeDifference, col = EscapeCategory)) + 
  geom_point() + scale_x_log10() + theme_paper() + scale_color_manual(values = escape_colors) + geom_smooth(method = 'lm', linetype = 'dashed')

# There is no relationship between these quantities

Finally, export the d-score changes across genes to a csv

data_here <- data[,data$ConditionClean %in% c("Control", "Dox (3d)", "Dox (7d)", "Dox (14d)", "Dox (21d)")]
data_here <- data_here[,data_here$Clone == "E6"]

ratios <- counts_inactive(data_here) / (counts_inactive(data_here) + counts_active(data_here))
ratios <- ratios[!rownames(ratios) %in% genes_out, ]

ratios %>%
  t() %>% data.frame() %>%
  add_column("Dox" = data_here$ndDox) %>%
  add_column("Clone" = paste0(data_here$Clone, " - ", data_here$Experiment)) %>%
  pivot_longer(-c(Dox, Clone)) %>%
  group_by(Clone) %>%
  dplyr::filter(name %in% escapees_per_clone[[unique(Clone)]]) %>%
  ungroup() %>%
  add_column(Condition = ifelse(.$Dox == 0, "Control", paste0("Dox (", .$Dox, " days)"))) %>%
  mutate(Condition = factor(Condition, levels = (c("Control", paste0("Dox (", c(3, 7, 14, 21), " days)"))))) %>%
  add_column(escape_status = unlist(rowData(data_here)[match(.$name, rownames(data_here)), ]$EscapeAnnotation)) %>%
  dplyr::filter(!is.na(escape_status)) %>%
  dplyr::select(-c(Dox, Clone)) %>%
  group_by(name, Condition) %>%
  summarize(value = mean(value), escape_status = unique(escape_status)) %>%
  ungroup() %>%
  pivot_wider(values_from = value, names_from = Condition) %>%
  mutate_at(vars(matches("Dox")), list(dif = ~ . - Control)) %>%
  write.csv("./ProcessedData/d_score_differences.csv")

Finally, we fit decay rates to the 0-21d timecourse. We use two models, one with and one without offset, to distinguish genes that get fully silenced vs genes that retain residual escape.

escapees_use <- as.character(unique(do.call("c", escapees_per_clone[grepl("E6", names(escapees_per_clone))])))

# Get relevant data, we only use E6 here
data_here <- data[,data$Condition %in% c("Aux_0_Dox_0_WO_0_WOAuxNO", "Aux_0_Dox_3_WO_0_WOAuxNO", "Aux_0_Dox_7_WO_0_WOAuxNO", 
                                         "Aux_0_Dox_14_WO_0_WOAuxNO", "Aux_0_Dox_21_WO_0_WOAuxNO")]
data_here <- data_here[,data_here$Clone == "E6"]

# define the exponential decay functions
exp_decay <- function(x, a, k) a * exp(-k*x) 
offset_exp_decay <- function(x, a, k, c) a * exp(-k*x) + c

# these functions perform the fit per gene. We stabilize the rates with a pseudocount of 1 and use non-linear regression to fit decay curves to the data.
# We restrict the parameter space to k > 0 and b > 0
fit_nls_curve_for_gene <- function(gene){
  df_here <- data.frame(
    active = as.numeric(counts_active(data_here)[gene, ]) + 1, 
    inactive = as.numeric(counts_inactive(data_here)[gene, ]) + 1, 
    timepoint = as.numeric(data_here$ndDox), 
    experiment = data_here$Experiment
  ) %>% mutate(total = active + inactive) %>%
    mutate(rate = inactive / (active + inactive))
  tryCatch({
    rates.nls <- nls(rate ~ a * exp(- k * timepoint), data = df_here, algorithm = "port", start = c(a = 0.5, k = 0.1), lower = c(a = -Inf, k = 0))
    return(c(coef(rates.nls), modelr::rsquare(rates.nls, data = df_here), logLik(rates.nls)))
  },
  error=function(cond) {
    return(NA)
  })
}

fit_nls_curve_for_gene_offset <- function(gene){
  df_here <- data.frame(
    active = as.numeric(counts_active(data_here)[gene, ]) + 1, 
    inactive = as.numeric(counts_inactive(data_here)[gene, ]) + 1, 
    timepoint = as.numeric(data_here$ndDox), 
    experiment = data_here$Experiment
  ) %>% mutate(total = active + inactive) %>%
    mutate(rate = inactive / (active + inactive))
  tryCatch({
    rates.nls.offset <- nls(rate ~ a * exp(- k * timepoint) + b, data = df_here, algorithm = "port", start = c(a = 0.5, k = 0.1, b = 0.1), lower = c(a = -Inf, k = 0, b = 0))
    return(c(coef(rates.nls.offset), modelr::rsquare(rates.nls.offset, data = df_here), logLik(rates.nls.offset)))
  }, 
  error=function(cond) {
    return(NA)
  })
}

# Run the fits and compute half lifes (t1/2 = log(0.5) / (-k))
general_escapees <- data_here[escapees_use, ]

all_coefs <- data.frame(do.call("rbind", lapply(rownames(general_escapees), fit_nls_curve_for_gene)))
all_coefs$gene <- rownames(general_escapees)
all_coefs$halflifes <- log(1/2) / ( - all_coefs$k )

# 
all_coefs_offset <- data.frame(do.call("rbind", lapply(rownames(general_escapees), fit_nls_curve_for_gene_offset)))
all_coefs_offset$gene <- rownames(general_escapees)
all_coefs_offset$halflifes <- log(1/2) / ( - all_coefs_offset$k )

# how many convergence errors?
table(is.na(all_coefs$a), is.na(all_coefs_offset$a)) # there is a small amount of genes where the offset model doesnt converge, exclude these genes
##        
##         FALSE TRUE
##   FALSE   147   15
# the excluded genes are mainly pathological, e.g. non-monotonous data

all_coefs_offset$category <- rowData(data_here[all_coefs$gene, ])$EscapeAnnotation

# Merge and compute bayes information critera for both models
merged_df <- data.frame(
  gene = all_coefs$gene, 
  category = all_coefs_offset$category, 
  a = all_coefs$a, 
  k = all_coefs$k, 
  r2 = all_coefs$V3, 
  loglik = all_coefs$V4, 
  halflifes = all_coefs$halflifes, 
  a_off = all_coefs_offset$a, 
  k_off = all_coefs_offset$k, 
  b_off = all_coefs_offset$b, 
  r2_off = all_coefs_offset$V4, 
  loglik_off = all_coefs_offset$V5,
  halflifes_off = all_coefs_offset$halflifes
) %>%
  mutate(BIC_native = 3 * log(ncol(data_here)) - 2 * loglik) %>%
  mutate(BIC_offset = 4 * log(ncol(data_here)) - 2 * loglik_off)

# These genes fail convergence - all but 5 have very low r2 in the non-offset model
merged_df %>%
  dplyr::filter(is.na(a_off)) %>% 
  dplyr::arrange(r2)
##             gene            category          a          k            r2
## 1           Drp2         facultative 0.10134444 0.00000000 -2.220446e-16
## 2          Timp1         facultative 0.23694296 0.00000000 -2.220446e-16
## 3          Cox7b silenced / variable 0.19528361 0.00000000  0.000000e+00
## 4           Apoo silenced / variable 0.15819510 0.00000000  0.000000e+00
## 5           Mid1        constitutive 0.06238980 0.00000000  0.000000e+00
## 6            Jpx        constitutive 0.28544042 0.00000000  1.110223e-16
## 7        Gm14680                <NA> 0.34995245 0.01595671  1.529868e-02
## 8        Map3k15 silenced / variable 0.08368699 0.06303914  8.465674e-02
## 9  3110067C02Rik                <NA> 0.23396680 0.05688354  1.229768e-01
## 10         Hdac8 silenced / variable 0.09678076 0.07189294  2.606631e-01
## 11 5530601H04Rik        constitutive 0.34956170 0.06017532  4.253086e-01
## 12         Ap1s2         facultative 0.16989407 0.47278973  6.594223e-01
## 13         Car5b         facultative 0.13348331 1.32179266  8.304988e-01
## 14       Ccdc120 silenced / variable 0.15108646 0.67908118  9.158316e-01
## 15 A230072C01Rik         facultative 0.21724140 0.63039899  9.360657e-01
##        loglik  halflifes a_off k_off b_off r2_off loglik_off halflifes_off
## 1  12.3894854        Inf    NA    NA    NA     NA         NA            NA
## 2   1.7240939        Inf    NA    NA    NA     NA         NA            NA
## 3   9.6866505        Inf    NA    NA    NA     NA         NA            NA
## 4   9.3651397        Inf    NA    NA    NA     NA         NA            NA
## 5  18.1205953        Inf    NA    NA    NA     NA         NA            NA
## 6   0.4092039        Inf    NA    NA    NA     NA         NA            NA
## 7   3.6193003 43.4392210    NA    NA    NA     NA         NA            NA
## 8  17.5958620 10.9955058    NA    NA    NA     NA         NA            NA
## 9   9.6193858 12.1853730    NA    NA    NA     NA         NA            NA
## 10 24.4879403  9.6413803    NA    NA    NA     NA         NA            NA
## 11 12.9049880 11.5187960    NA    NA    NA     NA         NA            NA
## 12 21.1810642  1.4660792    NA    NA    NA     NA         NA            NA
## 13 27.1774098  0.5243993    NA    NA    NA     NA         NA            NA
## 14 30.4540347  1.0207133    NA    NA    NA     NA         NA            NA
## 15 27.1789323  1.0995373    NA    NA    NA     NA         NA            NA
##     BIC_native BIC_offset
## 1  -17.3242508         NA
## 2    4.0065321         NA
## 3  -11.9185811         NA
## 4  -11.2755595         NA
## 5  -28.7864707         NA
## 6    6.6363121         NA
## 7    0.2161193         NA
## 8  -27.7370041         NA
## 9  -11.7840516         NA
## 10 -41.5211607         NA
## 11 -18.3552560         NA
## 12 -34.9074085         NA
## 13 -46.9000996         NA
## 14 -53.4533495         NA
## 15 -46.9031446         NA
# Exclude 24 genes with r2 < 0.3, don't fit exponential model
merged_df %>% 
  dplyr::filter(!(r2 > 0.3 & r2_off > 0.3)) %>% dim()
## [1] 22 15
merged_df <- merged_df %>% 
  dplyr::filter(r2 > 0.3 & r2_off > 0.3)


# We now ask which genes fit the offset vs the native model better (supplement)
merged_df %>%
  ggplot(aes(x = BIC_native, BIC_offset, col = category)) + geom_point() + geom_abline() +
  ggrepel::geom_text_repel(aes(label = gene)) + theme_paper() +  xlab("BIC exponential decay model") +  ylab("BIC exponential decay model + offset") + 
    labs(col = "Escape category") + scale_color_manual(values = escape_colors)

ggsave("./FiguresIllustrator/FigS1/bic_model_choice.pdf", width = 9, height = 6)

merged_df %>%
  ggplot(aes(x = halflifes_off, b_off, col = category, shape = BIC_native > BIC_offset)) + 
    geom_point(size = 3) + ylim(c(0, 0.3)) + ggrepel::geom_text_repel(aes(label = gene), size = 5) + 
    theme_paper() + scale_color_manual(values = escape_colors) + theme_paper() + xlab("half-life (non-linear regression) [days]") + 
    ylab("offset parameter (non-linear regression)") + labs(col = "Escape category", shape = "Offset model favored (BIC)") + scale_x_log10()

ggsave("./FiguresIllustrator/Fig1/half_life_vs_baseline.pdf", width = 9, height = 6)

merged_df %>%
  dplyr::filter(!is.na(category)) %>%
  ggplot(aes(x = category, y = halflifes_off, fill = category)) + geom_boxplot(width = 0.1, outlier.color = NA) + ggbeeswarm::geom_quasirandom() + theme_paper() + 
  scale_fill_manual(values = escape_colors) + xlab("") + ylab("half-life (non-linear regression) [days]") + ggtitle("Escape half-life")

merged_df %>%
  dplyr::filter(!is.na(category)) %>%
  ggplot(aes(x = category, y = b_off, fill = category)) + geom_boxplot(width = 0.1, outlier.color = NA) + ggbeeswarm::geom_quasirandom() + theme_paper() + 
  scale_fill_manual(values = escape_colors) + xlab("") + ylab("offset parameter (non-linear regression)") + ggtitle("Residual escape")

merged_df %>%
  dplyr::filter(!is.na(category)) %>%
  ggplot(aes(x = category, y = halflifes_off, fill = category)) + geom_boxplot(width = 0.1, outlier.color = NA) + ggbeeswarm::geom_quasirandom() +theme_paper() + 
  scale_fill_manual(values = escape_colors) + xlab("") + ylab("half-life (non-linear regression) [days]") + ggtitle("Escape half-life") + 
  ggrepel::geom_text_repel(aes(label = gene))

ggsave("./FiguresIllustrator/Fig1/half_life_category.pdf", width = 9, height = 6)

merged_df %>%
  dplyr::filter(!is.na(category)) %>%
  ggplot(aes(x = category, y = b_off, fill = category)) + geom_boxplot(width = 0.1, outlier.color = NA) + ggbeeswarm::geom_quasirandom() + theme_paper() + 
  scale_fill_manual(values = escape_colors) + xlab("") + ylab("offset parameter (non-linear regression)") + ggtitle("Residual escape") + 
  ggrepel::geom_text_repel(aes(label = gene))

ggsave("./FiguresIllustrator/Fig1/residual_escape_category.pdf", width = 9, height = 6)
# Here we can check individual genes

plot_decay_gene <- function(gene, save_plot = F, path = "./"){

  df_here <- data.frame(
    active = as.numeric(counts_active(data_here)[gene, ]) + 1, 
    inactive = as.numeric(counts_inactive(data_here)[gene, ]) + 1, 
    timepoint = as.numeric(data_here$ndDox), 
    experiment = data_here$Experiment
  ) %>% mutate(total = active + inactive) %>% mutate(rate = inactive / (active + inactive))
  
  df_native <- all_coefs %>% column_to_rownames("gene")
  df_offset <- all_coefs_offset %>% column_to_rownames("gene")
  
  data.frame(
    active = as.numeric(counts_active(data_here)[gene, ]) + 1, 
    inactive = as.numeric(counts_inactive(data_here)[gene, ]) + 1, 
    timepoint = as.numeric(data_here$ndDox), 
    experiment = data_here$Experiment
  ) %>% mutate(rates = inactive / (inactive + active)) %>%
    ggplot(aes(x = timepoint, y = rates)) + geom_point() + theme_paper() + 
      geom_function(fun = function(x, a = df_native[gene, ]$a, k = df_native[gene, ]$k) exp_decay(x, a, k), 
                    linetype = "dashed", aes(col = "Exp. Decay Model")) + ylab("allelic ratio") + 
      geom_function(fun = function(x, a = df_offset[gene, ]$a, k = df_offset[gene, ]$k, c = df_offset[gene, ]$b) offset_exp_decay(x, a, k, c), 
                    linetype = "dashed", aes(col = "Exp. Decay Model + Offset")) + ylab("allelic ratio") + labs(colour = "Model") + 
      ylim(c(0, 1)) + ggtitle(gene) + theme(legend.position = c(0.8, 0.9)) + scale_color_manual(values = c("red", "darkred")) + 
      theme(legend.background = element_rect(size=0.5, linetype="solid", colour ="black")) +
      annotate("text", label = paste0("R^2: ", round(all_coefs[all_coefs$gene == gene, ]$V3, digits = 4)), x = 2, y = 1, size = 8) + 
      annotate("text", label = paste0("t1/2: ", round(all_coefs[all_coefs$gene == gene, ]$halflifes, digits = 4)), x = 2, y = 0.9, size = 8) + 
      xlab("Days after after Xist induction") + ylab("d-score (B6 / (B6 + CAST)") -> p1
  
    
  if (save_plot){
    ggsave(paste0(path, "decay_curve_", gene, ".pdf"), p1,  width = 9, height = 6)
  }
  
  return(p1)
 
}

plot_decay_gene("Pbdc1", save_plot = T, path = "./FiguresIllustrator/Fig1/")

plot_decay_gene("Med14", save_plot = T, path = "./FiguresIllustrator/Fig1/")

plot_decay_gene("Mecp2", save_plot = T, path = "./FiguresIllustrator/Fig1/")

plot_decay_gene("Tfe3", save_plot = T, path = "./FiguresIllustrator/Fig1/")

genes_plot <- merged_df[merged_df$category == "constitutive", ]$gene
genes_plot <- genes_plot[!is.na(genes_plot)]

for (gene in genes_plot){
  plot_decay_gene(gene, save_plot = T, path = "./FiguresIllustrator/Fig1/")
}